{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Mini Batch SGD classifier and regressor\n",
    "Mini Batch SGD (MBSGD) models are linear models that are fitted by minimizing a regularized empirical loss with mini-batch SGD. In this notebook we compare the performance of cuML's MBSGD classifier and regressor models with their respective scikit-learn counterparts.\n",
    "\n",
    "The models accept array-like inputs, either on the host (as NumPy arrays) or on the device (as Numba or other `__cuda_array_interface__`-compliant arrays), as well as cuDF DataFrames.\n",
    "\n",
    "For information about cuDF, refer to the cuDF documentation: https://docs.rapids.ai/api/cudf/stable/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cudf as gd\n",
    "import cuml\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sklearn\n",
    "\n",
    "from sklearn import linear_model\n",
    "# `sklearn.datasets.samples_generator` was deprecated in scikit-learn 0.22 and\n",
    "# removed in 0.24; the generators are exposed by `sklearn.datasets` directly.\n",
    "from sklearn.datasets import make_classification, make_regression\n",
    "from sklearn.metrics import accuracy_score, r2_score\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define parameters\n",
    "\n",
    "### Data parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Size of the synthetic datasets: rows, columns, and informative columns.\n",
    "num_samples = 2 ** 13\n",
    "num_features = 300\n",
    "n_informative = 270\n",
    "\n",
    "# Seed reused for data generation, the train/test split and the scikit-learn models.\n",
    "random_state = 0\n",
    "\n",
    "# Value handed to train_test_split as `train_size`.\n",
    "train_size = 0.8\n",
    "\n",
    "# dtype the generated features and targets are cast to.\n",
    "datatype = np.float32"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hyper-parameters handed to both the scikit-learn and the cuML estimators.\n",
    "learning_rate = 'constant'\n",
    "eta0 = 0.005\n",
    "penalty = 'elasticnet'\n",
    "fit_intercept = True\n",
    "tol = 0.0\n",
    "\n",
    "# Passed as `max_iter` to scikit-learn and as `epochs` to cuML.\n",
    "max_iter = 100\n",
    "\n",
    "# In this notebook only the cuML MBSGD estimators receive a batch size.\n",
    "batch_size = 2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate data\n",
    "\n",
    "### Host"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# ---- classification data ----\n",
    "X_class, y_class = make_classification(\n",
    "    n_samples=num_samples,\n",
    "    n_features=num_features,\n",
    "    n_informative=n_informative,\n",
    "    random_state=random_state,\n",
    ")\n",
    "\n",
    "# cast to the configured dtype and wrap the NumPy arrays in pandas DataFrames\n",
    "X_class = pd.DataFrame(X_class.astype(datatype))\n",
    "y_class = pd.DataFrame(y_class.astype(datatype))\n",
    "\n",
    "X_class_train, X_class_test, y_class_train, y_class_test = train_test_split(\n",
    "    X_class, y_class, train_size=train_size, random_state=random_state\n",
    ")\n",
    "\n",
    "# ---- regression data ----\n",
    "X_reg, y_reg = make_regression(\n",
    "    n_samples=num_samples,\n",
    "    n_features=num_features,\n",
    "    n_informative=n_informative,\n",
    "    random_state=random_state,\n",
    ")\n",
    "\n",
    "# cast to the configured dtype and wrap the NumPy arrays in pandas DataFrames\n",
    "X_reg = pd.DataFrame(X_reg.astype(datatype))\n",
    "y_reg = pd.DataFrame(y_reg.astype(datatype))\n",
    "\n",
    "X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(\n",
    "    X_reg, y_reg, train_size=train_size, random_state=random_state\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GPU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Copy the host-side pandas splits into cuDF objects: features become cuDF\n",
    "# DataFrames, the single target column becomes a cuDF Series.\n",
    "\n",
    "# classification dataset\n",
    "X_class_cudf = gd.DataFrame.from_pandas(X_class_train)\n",
    "X_class_cudf_test = gd.DataFrame.from_pandas(X_class_test)\n",
    "y_class_cudf = gd.Series(y_class_train.iloc[:, 0].values)\n",
    "\n",
    "# regression dataset\n",
    "X_reg_cudf = gd.DataFrame.from_pandas(X_reg_train)\n",
    "X_reg_cudf_test = gd.DataFrame.from_pandas(X_reg_test)\n",
    "y_reg_cudf = gd.Series(y_reg_train.iloc[:, 0].values)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scikit-learn Model\n",
    "\n",
    "### Classification:\n",
    "\n",
    "#### Fit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "skl_sgd_classifier = sklearn.linear_model.SGDClassifier(learning_rate=learning_rate,\n",
    "                                                        eta0=eta0,\n",
    "                                                        max_iter=max_iter,\n",
    "                                                        fit_intercept=fit_intercept,\n",
    "                                                        tol=tol,\n",
    "                                                        penalty=penalty,\n",
    "                                                        random_state=random_state)\n",
    "\n",
    "# The labels are stored in a single-column DataFrame; flatten them to 1-D so\n",
    "# scikit-learn does not have to convert a column vector (DataConversionWarning).\n",
    "skl_sgd_classifier.fit(X_class_train, y_class_train.values.ravel())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "skl_class_pred = skl_sgd_classifier.predict(X_class_test)\n",
    "# scikit-learn metrics take the ground truth first: accuracy_score(y_true, y_pred)\n",
    "skl_class_acc = accuracy_score(y_class_test, skl_class_pred)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scikit-learn Model\n",
    "\n",
    "### Regression:\n",
    "\n",
    "#### Fit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "skl_sgd_regressor = sklearn.linear_model.SGDRegressor(learning_rate=learning_rate,\n",
    "                                                      eta0=eta0,\n",
    "                                                      max_iter=max_iter,\n",
    "                                                      fit_intercept=fit_intercept,\n",
    "                                                      tol=tol,\n",
    "                                                      penalty=penalty,\n",
    "                                                      random_state=random_state)\n",
    "\n",
    "# The targets are stored in a single-column DataFrame; flatten them to 1-D so\n",
    "# scikit-learn does not have to convert a column vector (DataConversionWarning).\n",
    "skl_sgd_regressor.fit(X_reg_train, y_reg_train.values.ravel())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "skl_reg_pred = skl_sgd_regressor.predict(X_reg_test)\n",
    "# r2_score(y_true, y_pred) is not symmetric: the ground truth must come first,\n",
    "# otherwise the score is normalised by the variance of the predictions.\n",
    "skl_reg_r2 = r2_score(y_reg_test, skl_reg_pred)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## cuML Model\n",
    "\n",
    "### Classification:\n",
    "\n",
    "#### Fit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Build the cuML mini-batch SGD classifier from the shared model parameters.\n",
    "cu_mbsgd_classifier = cuml.linear_model.MBSGDClassifier(\n",
    "    learning_rate=learning_rate,\n",
    "    eta0=eta0,\n",
    "    epochs=max_iter,\n",
    "    fit_intercept=fit_intercept,\n",
    "    batch_size=batch_size,\n",
    "    tol=tol,\n",
    "    penalty=penalty,\n",
    ")\n",
    "\n",
    "# Train on the cuDF classification training split.\n",
    "cu_mbsgd_classifier.fit(X_class_cudf, y_class_cudf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Bring the predictions back to the host so they can be scored with scikit-learn.\n",
    "cu_class_pred = cu_mbsgd_classifier.predict(X_class_cudf_test).to_array()\n",
    "# scikit-learn metrics take the ground truth first: accuracy_score(y_true, y_pred)\n",
    "cu_class_acc = accuracy_score(y_class_test, cu_class_pred)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Regression:\n",
    "\n",
    "#### Fit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Build the cuML mini-batch SGD regressor from the shared model parameters.\n",
    "cu_mbsgd_regressor = cuml.linear_model.MBSGDRegressor(\n",
    "    learning_rate=learning_rate,\n",
    "    eta0=eta0,\n",
    "    epochs=max_iter,\n",
    "    fit_intercept=fit_intercept,\n",
    "    batch_size=batch_size,\n",
    "    tol=tol,\n",
    "    penalty=penalty,\n",
    ")\n",
    "\n",
    "# Train on the cuDF regression training split.\n",
    "cu_mbsgd_regressor.fit(X_reg_cudf, y_reg_cudf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Bring the predictions back to the host so they can be scored with scikit-learn.\n",
    "cu_reg_pred = cu_mbsgd_regressor.predict(X_reg_cudf_test).to_array()\n",
    "# r2_score(y_true, y_pred) is not symmetric: the ground truth must come first,\n",
    "# otherwise the score is normalised by the variance of the predictions.\n",
    "cu_reg_r2 = r2_score(y_reg_test, cu_reg_pred)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate Results\n",
    "\n",
    "### Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both values come from accuracy_score, so label them as accuracy rather than R^2.\n",
    "print(\"Sklearn's accuracy score for classification : %s\" % skl_class_acc)\n",
    "print(\"cuML's accuracy score for classification : %s\" % cu_class_acc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the two regression R^2 scores, one per line.\n",
    "print(\"Sklearn's R^2 score for regression : %s\" % skl_reg_r2,\n",
    "      \"cuML's R^2 score for regression : %s\" % cu_reg_r2,\n",
    "      sep=\"\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
